
# %%
import pandas as pd
import csv

# Load the tab-separated p-blocks table.
df = pd.read_csv('pblocks.tsv', delimiter='\t')

# The gene name is taken to be the text before the first '-' in 'anchor'.
# NOTE(review): a gene symbol that itself contains '-' would be cut short
# by this split -- confirm the anchor format.
anchor_parts = df['anchor'].str.split('-')
df['gene_name'] = anchor_parts.str.get(0)

# Single-column table holding each distinct gene name once.
unique_gene_names = pd.Series(df['gene_name'].unique(), name='gene_name').to_frame()

# Show the de-duplicated names.
print(unique_gene_names)



# %%

# Define the input GTF file and output CSV file
gtf_file = 'gencode.v42.basic.annotation.gtf'  # Replace with the actual file path
csv_file = 'gene_names.csv'  # Replace with the desired output file name

# Attribute prefix that introduces the gene symbol in the 9th GTF column
GENE_NAME_PREFIX = 'gene_name "'

# Create a set to store unique gene names
gene_names = set()

# Read the GTF file and extract unique gene names
with open(gtf_file, 'r', encoding='utf-8') as gtf:
    for line in gtf:
        if line.startswith('#'):
            continue  # Skip comment lines
        fields = line.strip().split('\t')
        if len(fields) < 9:
            # Blank or truncated line: there is no attribute column to parse,
            # and indexing fields[8] would raise IndexError.
            continue
        for attr in fields[8].split(';'):
            attr = attr.strip()
            if not attr.startswith(GENE_NAME_PREFIX):
                continue
            # Drop the prefix and the closing double quote
            gene_name = attr[len(GENE_NAME_PREFIX):-1]
            # Keep only real symbols, not names that are ENSG IDs
            if not gene_name.startswith('ENSG'):
                gene_names.add(gene_name)

# Write the unique gene names to a CSV file.
# Sorting makes the output reproducible between runs (set order is arbitrary).
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Gene Name'])  # Write header
    writer.writerows([name] for name in sorted(gene_names))

print(f'Unique gene names extracted and saved to {csv_file}')


# %%


# Input locations: the gene-name list and the APPRIS gene table.
gene_names_file = 'gene_names.csv'
genes_with_APPRIS_file = 'genes_with_APPRIS.csv'

# Load both tables.
df_APPRIS = pd.read_csv(genes_with_APPRIS_file)
df_gene_names = pd.read_csv(gene_names_file)

# The first column of each table holds the values being compared.
gene_names_column = df_gene_names.iloc[:, 0]
APPRI_gene_column = df_APPRIS.iloc[:, 0]

# Keep the APPRIS rows whose first-column value never occurs in gene_names.csv.
found_in_gene_names = APPRI_gene_column.isin(gene_names_column)
missing_rows = df_APPRIS[~found_in_gene_names]

# Report the unmatched rows.
print("Rows in 'genes_with_APPRIS.csv' but not in 'gene_names.csv':", missing_rows, sep='\n')



# %%

# Read the TSV file into a DataFrame
df = pd.read_csv('pblocks.tsv', sep='\t')

# Split the 'anchor' column and create a new 'gene_name' column
# (text before the first '-').
# NOTE(review): a gene symbol that itself contains '-' would be truncated
# by this split -- confirm the anchor format.
df['gene_name'] = df['anchor'].str.split('-').str[0]

# Create a new DataFrame with unique gene names; the column is named
# explicitly so it can be selected by label below.
unique_gene_names = pd.DataFrame({'gene_name': df['gene_name'].unique()})

# Print the unique gene names
print(unique_gene_names)


# Only the APPRIS table is loaded here: this comparison is against the gene
# names derived from 'pblocks.tsv', so 'gene_names.csv' is not needed.
genes_with_APPRIS_file = 'genes_with_APPRIS.csv'
df_APPRIS = pd.read_csv(genes_with_APPRIS_file)

# Extract the values from the first column of 'genes_with_APPRIS.csv'
APPRI_gene_column = df_APPRIS.iloc[:, 0]

# Gene names found in 'pblocks.tsv'
gene_names_column = unique_gene_names['gene_name']

# Find the rows in 'genes_with_APPRIS.csv' whose gene is absent from 'pblocks.tsv'
missing_rows = df_APPRIS[~APPRI_gene_column.isin(gene_names_column)]

# Print or manipulate the missing rows as needed
print("Rows in 'genes_with_APPRIS.csv' but not in 'pblocks.tsv':")
print(missing_rows)


# %%
# Destination file for the rows currently held in `missing_rows`.
# NOTE(review): the file is called 'genes_without_APPRIS' although the rows
# are selected from the APPRIS table -- confirm the intended name.
csv_file_path = 'genes_without_APPRIS.csv'

# Persist the table as CSV; index=False leaves the row index out of the file.
missing_rows.to_csv(path_or_buf=csv_file_path, index=False)


